A house's value is more than just location and square footage. Like the features that make up a person, an educated party will want to know all the aspects that give a house its value. For example, suppose you want to sell a house and you don't know what price you can ask — it can't be too low or too high. To find the house price you usually try to find similar properties in your neighbourhood and, based on the gathered data, you try to assess your house's price.
#suppress warnings
import folium
from folium import plugins
from io import StringIO
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
import pandas as pd
import os
import plotly.offline as py
from plotly import tools
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
import plotly.figure_factory as ff
from IPython.core.display import display, HTML
init_notebook_mode(connected=True)
#Limiting floats output to 3 decimal points
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))
import seaborn as sns
from sklearn import linear_model
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import iqr, zscore,norm
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from scipy.spatial.distance import pdist #Pairwise distribution between data points
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor,
AdaBoostRegressor)
from sklearn.decomposition import PCA
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.model_selection import train_test_split,KFold, cross_val_score,RandomizedSearchCV,GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics
from xgboost.sklearn import XGBRegressor
from scipy.stats import randint as sp_randint
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# CSS styling applied to pandas DataFrame HTML tables.
# Header (<th>) cells: small, centered, bold grey text on a light background.
th_props = [
    ('font-size', '11px'),
    ('text-align', 'center'),
    ('font-weight', 'bold'),
    ('color', '#6d6d6d'),
    ('background-color', '#f7f7f9'),
]
# Data (<td>) cells: only the smaller font size.
td_props = [
    ('font-size', '11px'),
]
# Combined table style: one mapping per CSS selector.
styles = [
    {"selector": "th", "props": th_props},
    {"selector": "td", "props": td_props},
]
# Global seaborn theme: ticks style with larger fonts, then darkgrid background.
sns.set(style="ticks", color_codes=True, font_scale=1.5)
color = sns.color_palette()
sns.set_style('darkgrid')
#os.chdir(r"D:\Data Science\Great Lakes Project\CapstoneProject")
# Load the inner-city housing dataset (the "Diabetes" label was a template leftover).
# reading the CSV file into a pandas dataframe
housing_df = pd.read_csv("innercity.csv")
house_df = housing_df.copy()  # untouched copy of the raw data, used later for modelling
# Display first 10 rows of dataset
housing_df.head(10)
So now we have data, let's define steps before jumping into model building
# Summary statistics for every column except the house id 'cid', plus dtypes/null info.
housing_df.drop('cid',axis=1).describe().transpose()
housing_df.info()
Important questions when thinking about missing data:
The answer to these questions is important for practical reasons because missing data can imply a reduction of the sample size. This can prevent us from proceeding with the analysis. Moreover, from a substantive perspective, we need to ensure that the missing data process is not biased and hiding an inconvenient truth.
#missing data => count missing values per column and their share of all rows
null_counts = housing_df.isnull().sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / housing_df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
Non-considerable zero value in field
Considerable zero value in fields
# Count zero entries in every numeric column (zeros may encode "absent" rather than a measurement).
for col in housing_df.columns:
    if housing_df[col].dtypes in ('int64', 'float64'):
        zero_count = np.count_nonzero(housing_df[col] == 0)
        if zero_count:
            print('Number of 0-entries for "{field_name}" feature:{amount}'.format(
                field_name=col, amount=zero_count))
Outliers are also something that we should be aware of. Why? Because outliers can markedly affect our models and can be a valuable source of information, providing us insights about specific behaviours.
Outlier handling is a complex subject and it deserves more attention. Here, we'll just do a quick analysis through the standard deviation of 'price' and a set of scatter plots.
def checkIQR(data):
    """Print, for each numeric column of `data`, how many values fall outside
    the Tukey fences Q1 - 1.5*IQR .. Q3 + 1.5*IQR.

    Fixes vs the original: iterates `data.columns` instead of the global
    `housing_df` (the parameter was otherwise ignored), and computes Q1/Q3
    with the same midpoint interpolation used for the IQR (the original mixed
    linear-interpolated quantiles with a midpoint IQR from scipy).
    """
    print("Attributes for which values lie outside of IQR")
    for field in data.columns:
        # Only numeric columns have a meaningful IQR.
        if data[field].dtypes == 'int64' or data[field].dtypes == 'float64':
            Q1 = data[field].quantile(0.25, interpolation='midpoint')
            Q3 = data[field].quantile(0.75, interpolation='midpoint')
            current_iqr = Q3 - Q1
            is_outlier = (data[field] < (Q1 - 1.5 * current_iqr)) | (data[field] > (Q3 + 1.5 * current_iqr))
            if is_outlier.any():
                print('{field_name} : {flag}'.format(field_name=field, flag=sum(is_outlier)))
# Report outlier counts for every numeric column of the raw data.
checkIQR(housing_df)
Below attributes are having high number of values as outliers
# Box plot to see outliers pattern in the data.
# 2x2 grid over the four columns flagged above as outlier-heavy.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)
fig1 = fig.add_subplot(221);
sns.boxplot(housing_df.total_area, data=housing_df)
fig2 = fig.add_subplot(222);
sns.boxplot(housing_df.lot_measure, data=housing_df)
fig3 = fig.add_subplot(223);
sns.boxplot(housing_df.lot_measure15, data=housing_df)
fig4 = fig.add_subplot(224);
sns.boxplot(housing_df.yr_renovated, data=housing_df)
# Convert dayhours to a datetime by extracting its leading 8 digits (yyyymmdd),
# then derive the sale year into a new 'yr_sold' feature.
housing_df.dayhours = housing_df.dayhours.str.extract(r'(\d{4}\d{2}\d{2})')
housing_df.dayhours = pd.to_datetime(housing_df.dayhours)
housing_df['yr_sold'] = housing_df.dayhours.dt.year
housing_df.head()
we have created below columns to understand the distribution of data.
1. types - data type of the feature.
2. counts - total count of records for that feature.
3. distincts - total number of distinct value for that feature.
4. uniques - all unique values in that feature.
5. skewness - skewness of the data for all features.
6. kurtosis - measure of tailedness of the data for all features.
7. corr_price - correlation of each feature with target variable 'price'
# Distribution of the target variable 'price' (histogram + KDE).
sns.set(rc={'figure.figsize': (11, 8)})
sns.distplot(housing_df['price'],hist=True, kde=True)
Skewness is a measure of the symmetry of a distribution. Kurtosis measures the amount of probability in the tails; its value is often compared to the kurtosis of the normal distribution, which is equal to 3. If the kurtosis is greater than 3, then the dataset has heavier tails than a normal distribution.
# Numeric skewness/kurtosis of the target distribution.
print("Skewness: %f" % housing_df['price'].skew())
print("Kurtosis: %f" % housing_df['price'].kurt())
def rstr(df, pred=None):
obs = df.shape[0]
types = df.dtypes
counts = df.apply(lambda x: x.count())
uniques = df.apply(lambda x: [x.unique()])
nulls = df.apply(lambda x: x.isnull().sum())
distincts = df.apply(lambda x: x.unique().shape[0])
missing_ration = (df.isnull().sum()/ obs) * 100
skewness = df.skew()
kurtosis = df.kurt()
print('Data shape:', df.shape)
if pred is None:
cols = ['types', 'counts', 'distincts', 'uniques', 'skewness', 'kurtosis']
str = pd.concat([types, counts, distincts, uniques, skewness, kurtosis], axis = 1)
else:
corr = df.corr()[pred]
str = pd.concat([types, counts, distincts, uniques, skewness, kurtosis, corr], axis = 1)
corr_col = 'corr_' + pred
cols = ['types', 'counts', 'distincts','uniques', 'skewness', 'kurtosis', corr_col ]
str.columns = cols
dtypes = str.types.value_counts()
print('___________________________\n')
print('Data types:\n',str.types.value_counts())
print('___________________________')
return str
# Build the per-column summary, including correlation with price, sorted by it.
details = rstr(housing_df, 'price')
display(details.sort_values(by='corr_price', ascending=False))
Features high correlation to Sales Price
Features with good correlation to Sales Price
Features with weak correlation to Sales Price
correlation coefficient measures a degree of relation between two variables, it only measures the linear relationship between the variables.
A correlation of -1.0 shows a perfect negative correlation, while a correlation of 1.0 shows a perfect positive
A correlation of 0.0 shows zero or no relationship between the movement of the two variables. From above stats we saw skewness and correlation of fields with price.
Let's understand the correlation from heatmap. This heatmap is the best way to get a quick overview to understand the relationship between dependent and independent variable.
At first sight, there are two strongly colored squares that get my attention. The first one refers to the 'lot_measure' and 'total_area' variables, and the second one refers to the 'living_measure' and 'ceil_measure' variables. Both cases show how significant the correlation is between these variables. Actually, this correlation is so strong that it can indicate a situation of multicollinearity. If we think about these variables, we can conclude that they give almost the same information, so multicollinearity really occurs. Heatmaps are great to detect this kind of situation and, in problems dominated by feature selection, like ours, they are an essential tool.
Another thing that got our attention was the 'price' correlations. We can see our well-known 'living_measure', 'quality', and 'ceil_measure' saying a big 'Hi!', but we can also see many other variables that should be taken into account. That's what we will do next.
# Correlation heatmap over all features except the id; the redundant upper
# triangle is masked out.
housing_df_drop = housing_df.drop('cid',axis=1)
corr = housing_df_drop.corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize = (20,12))
with sns.axes_style("white"):
    sns.heatmap(corr,annot=True,linewidth=1,mask = mask,vmax=1,vmin=-1,fmt='.2f')
plt.title("Correlation between variables")
plt.show()
# Compare the price distributions of the two sale years with a plotly distplot.
tmp1 = housing_df[housing_df['yr_sold'] == 2014]['price']
tmp2 = housing_df[housing_df['yr_sold'] == 2015]['price']
hist_data = [tmp1, tmp2]
group_labels = ['House Price 2014', 'House Price 2015']
colors = ['#4256f4', '#FFD700']
fig = ff.create_distplot(hist_data, group_labels, colors = colors, show_hist = True, bin_size = 0, curve_type='kde')
fig['layout'].update(title = 'Distribution Plot for House price 2014 and 2015 ')
py.iplot(fig)
Attributes are
# Price against the features most correlated with it (see the heatmap above).
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)
fig1 = fig.add_subplot(221);
sns.scatterplot(x = housing_df.living_measure, y = housing_df.price, data=housing_df)
fig2 = fig.add_subplot(222);
sns.boxplot(x=housing_df.quality, y=housing_df.price, data=housing_df)
fig3 = fig.add_subplot(223);
sns.scatterplot(x = housing_df.ceil_measure, y = housing_df.price, data=housing_df)
fig4 = fig.add_subplot(224);
sns.boxplot(x = housing_df.furnished, y = housing_df.price, data=housing_df)
fig5 = plt.figure(figsize=(20, 8))
fig6 = fig5.add_subplot(121);
sns.scatterplot(x = housing_df.room_bath, y = housing_df.price, data=housing_df)
fig7 = fig5.add_subplot(122);
sns.scatterplot(x = housing_df.living_measure15, y = housing_df.price, data=housing_df)
Attributes are
# Price against the moderately correlated features.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)
fig1 = fig.add_subplot(221);
sns.boxplot(x = housing_df.sight, y = housing_df.price, data=housing_df)
fig2 = fig.add_subplot(222);
sns.scatterplot(x=housing_df.basement, y=housing_df.price, data=housing_df)
fig3 = fig.add_subplot(223);
sns.boxplot(x = housing_df.room_bed, y = housing_df.price, data=housing_df)
fig4 = fig.add_subplot(224);
sns.scatterplot(x = housing_df.lat, y = housing_df.price, data=housing_df)
fig5 = plt.figure(figsize=(17, 8))
fig6 = fig5.add_subplot(121);
sns.boxplot(x = housing_df.coast, y = housing_df.price, data=housing_df)
fig7 = fig5.add_subplot(122);
sns.boxplot(x = housing_df.ceil, y = housing_df.price, data=housing_df)
We are not analysing cid and dayhours: cid because it is just a unique id, and dayhours because we have already created the new column yr_sold from it.
# Price against the weakly correlated features.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)
fig1 = fig.add_subplot(221);
sns.boxplot(x = housing_df.condition, y = housing_df.price, data=housing_df)
fig2 = fig.add_subplot(222);
sns.scatterplot(x=housing_df.total_area, y=housing_df.price, data=housing_df)
fig3 = fig.add_subplot(223);
sns.scatterplot(x = housing_df.lot_measure, y = housing_df.price, data=housing_df)
fig4 = fig.add_subplot(224);
sns.scatterplot(x = housing_df.lot_measure15, y = housing_df.price, data=housing_df)
# Price against the year- and location-related features.
fig = plt.figure(figsize=(20, 15))
sns.set(font_scale=1.5)
fig1 = fig.add_subplot(221);
sns.scatterplot(x = housing_df.yr_built, y = housing_df.price, data=housing_df)
fig2 = fig.add_subplot(222);
sns.boxplot(x=housing_df.yr_sold, y=housing_df.price, data=housing_df)
fig3 = fig.add_subplot(223);
sns.scatterplot(x = housing_df.yr_renovated, y = housing_df.price, data=housing_df)
fig4 = fig.add_subplot(224);
sns.scatterplot(x = housing_df.zipcode, y = housing_df.price, data=housing_df)
fig5 = plt.figure(figsize=(17, 8))
fig6 = fig5.add_subplot(121);
sns.scatterplot(x = housing_df.long, y = housing_df.price, data=housing_df)
As from below heatmap we found that
So, we can drop living_measure15,lot_measure15 & total_area
# As we have seen, living_measure and lot_measure correlate with price.
# Check whether the area features also correlate with each other (heatmap).
housing_df_space = housing_df[['living_measure','lot_measure','living_measure15','lot_measure15','total_area']].copy()
corr = housing_df_space.corr()
plt.figure(figsize=(15,8))
sns.heatmap(corr, annot=True,vmax=1,vmin=-1)
What has been revealed:
#bivariate analysis price/lot_measure (y-axis capped at 800k to see the bulk of the data)
var = 'lot_measure'
data = pd.concat([housing_df['price'], housing_df[var]], axis=1)
data.plot.scatter(x=var, y='price', ylim=(0,800000));
lot_measure15 : lot_measure15 has many of its data points set to zero. The column was added because the lot area of a house changes after renovation. Also, if lot_measure15 increases after renovation, prices also tend to increase.
What has been revealed:
#bivariate analysis price/lot_measure15 (same y-axis cap as above)
var = 'lot_measure15'
data = pd.concat([housing_df['price'], housing_df[var]], axis=1)
data.plot.scatter(x=var, y='price', ylim=(0,800000));
So as predicted, all locations are from Mercer Island, United States of America.
Now let's evaluate whether we can see any clusters, so that we can merge lat & long into a single feature. But once we have created the clusters, we have to evaluate their correlation with price.
# Interactive map centred on the mean coordinates of the listings.
location = folium.Map([house_df['lat'].mean(), house_df['long'].mean()], zoom_start=15,tiles='OpenStreetMap')
location
#histogram to understand the distribution of latitude
sns.set(rc={'figure.figsize': (11, 8)})
sns.distplot(housing_df['lat'],hist=True, kde=True);
# histogram to understand the distribution of longitude
sns.set(rc={'figure.figsize': (11, 8)})
sns.distplot(housing_df['long'],hist=True, kde=True);
# create_bins clusters a single numeric column with K-Means and stores the
# cluster label as a new categorical column on the global housing_df.
# Input Args :
#   N       - number of clusters for the final fit.
#   var     - source column name in housing_df.
#   var_cat - name of the new categorical column to be added to housing_df.
def create_bins(N, var, var_cat):
    """Cluster housing_df[var] into N groups; add labels as housing_df[var_cat].

    Side effects: mutates the global `housing_df` (adds/overwrites column
    `var_cat` as a categorical), prints diagnostics, and draws an elbow
    (WCSS vs k) chart so the choice of N can be sanity-checked.
    """
    bins_df = housing_df[[var]].copy()
    # K-Means is distance based, so standardise the column first.
    bins_df_scaled = bins_df.apply(zscore)
    bins_df_scaled = pd.DataFrame(bins_df_scaled, columns=bins_df.columns)
    # Elbow chart: within-cluster sum of squares for k = 1..14.
    wcss = []
    for k in range(1, 15):
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(bins_df_scaled)
        wcss.append(kmeans.inertia_)
    plt.figure(figsize=(20, 8))
    plt.title("WCSS / K Chart", fontsize=18)
    plt.plot(range(1, 15), wcss, "-o")
    plt.grid(True)
    plt.xlabel("Amount of Clusters", fontsize=14)
    plt.ylabel("Inertia", fontsize=14)
    # Fixed: ticks previously ran to 20 although only k = 1..14 are plotted.
    plt.xticks(range(1, 15))
    plt.tight_layout()
    plt.show()
    # Final clustering with the requested number of clusters.
    kmeans = KMeans(n_clusters=N, n_init=5, random_state=12345)
    kmeans.fit(bins_df_scaled)
    # Check the number of data points in each cluster (kept for debugging).
    labels = kmeans.labels_
    counts = np.bincount(labels[labels >= 0])
    # print(counts)
    centroids = kmeans.cluster_centers_
    centroid_df = pd.DataFrame(centroids, columns=list(bins_df_scaled))
    print('centroid values')
    print(centroid_df.transpose())
    predictions = kmeans.predict(bins_df_scaled)
    print(predictions)
    housing_df[var_cat] = predictions
    housing_df[var_cat] = housing_df[var_cat].astype('category')
    print(housing_df.dtypes)
# Cluster latitude into 6 'location' bins, then compare price across them.
create_bins(6,'lat','location')
sns.set(rc={'figure.figsize': (11, 8)})
sns.boxplot(housing_df['location'], housing_df['price'])
So as predicted, all locations are from Mercer Island, United States of America.
Now let's evaluate whether we can see any clusters, so that we can merge lat & long into a single feature. But once we have created the clusters, we have to evaluate their correlation with price.
sns.set(rc={'figure.figsize': (11.7, 8.27)})
sns.set_style('whitegrid')
# How many houses fall into each location cluster.
ax = sns.countplot(x='location', data=housing_df, palette=sns.color_palette('Blues'))
ax.set(xlabel='Location', ylabel='Count')
plt.show()
# Mean price per location cluster.
housing_df_price = housing_df.groupby('location')['price'].mean()
housing_df_price
# Cluster room_bath into 6 bins the same way and inspect each bin's mean.
create_bins(6,'room_bath','room_bath_cat')
housing_df.groupby(['room_bath_cat'])['room_bath'].mean()
# adding new column 'age' to the housing dataframe: house age at the time of sale (yr_sold - yr_built)
housing_df['age']=housing_df['yr_sold'].astype(int)-housing_df['yr_built']
So as expected year of built and year of sold is same that's why age for many data values is coming as zero. We can note down these houses for further analysis as these can be best for buyer as well as seller.
# age_renovated: years between renovation and sale; 0 for never-renovated houses
# (yr_renovated == 0). The original used chained-indexing assignment, which
# triggers pandas' SettingWithCopyWarning and can silently fail to write;
# .where() computes the same result in one safe assignment.
renovated = housing_df['yr_renovated'] != 0
housing_df['age_renovated'] = (housing_df['yr_sold'].astype(int)
                               - housing_df['yr_renovated']).where(renovated, 0)
# partition the age into bins
bins = [-2,0,5,10,25,50,75,100,500]
labels = ['<1','1-5','6-10','11-25','26-50','51-75','76-100','>100']
housing_df['age_binned'] = pd.cut(housing_df['age'], bins=bins, labels=labels)
# partition the age since renovation into bins
bins = [-2,0,5,10,25,50,75,100000]
labels = ['0-1','1-5','6-10','11-25','26-50','51-75','>75']
housing_df['age_renovated_binned'] = pd.cut(housing_df['age_renovated'], bins=bins, labels=labels)
# Side-by-side counts of the two binned columns, split by condition / furnished.
f, axes = plt.subplots(1, 2,figsize=(15,5))
p1=sns.countplot(x='age_binned',data=housing_df,ax=axes[0],hue='condition')
sns.countplot(x='age_renovated_binned',data=housing_df,ax=axes[1],hue='furnished')
#histogram of house age
sns.set(rc={'figure.figsize': (11, 8)})
sns.distplot(housing_df['age'],hist=True, kde=True);
# Relationship of age (raw and binned) and build year with price.
sns.scatterplot(x = housing_df.age, y = housing_df.price, data=housing_df)
sns.jointplot(x='age', y="price", data=housing_df, kind = 'reg', height = 5)
sns.jointplot(x='yr_built', y="price", data=housing_df, kind = 'reg', height = 5)
p1=sns.countplot(x='age_binned',data=housing_df)
sns.scatterplot(x = housing_df.age_binned, y = housing_df.price, data=housing_df)
Condition tells how the house is (Overall).When it says overall it may be including all aspects like strength,painting,wiring etc.
So as house buyer this should be one of key feature for selecting any house and its price.Lets evaluate why it is very weak predictor of price.
# Price and age against condition, and condition counts per sale year.
sns.boxplot(x = housing_df.condition, y = housing_df.price, data=housing_df)
sns.boxplot(x = housing_df.condition, y = housing_df.age, data=housing_df)
#f, axes = plt.subplots(1, 1,figsize=(15,5))
p1=sns.countplot(x='yr_sold',hue='condition',data=housing_df)
# Correlation of condition with price restricted to the dominant range (>= 3).
housing_df_copy = housing_df[housing_df['condition'] >= 3]
housing_df_copy['condition'].corr(housing_df_copy['price'])
So as we see, the data is not balanced: we have many more records with condition 3 than with the other condition values. Sale price appears to decrease with higher condition values, but we cannot take this negative correlation at face value.
For now we will not remove it, will evaluate further.
# Baseline linear regression on the raw (untransformed) features.
X = house_df.drop({'price','dayhours','cid','total_area'}, axis=1)
y = house_df[['price']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30 , random_state=1)
LRM = LinearRegression().fit(X_train, y_train)
print('Train Score:',LRM.score(X_train, y_train))
print('Test Score: ',LRM.score(X_test, y_test))
print('Intercept: ',LRM.intercept_)
From the above modelling, we see the linear regression model is giving low accuracy, so we need to verify whether all the linear regression assumptions hold.
According to Hair et al. (2013), four assumptions should be tested:
Normality - When we talk about normality what we mean is that the data should look like a normal distribution. This is important because several statistic tests rely on this (e.g. t-statistics). In this exercise we'll just check univariate normality for 'price' (which is a limited approach). Remember that univariate normality doesn't ensure multivariate normality (which is what we would like to have), but it helps. Another detail to take into account is that in big samples (>200 observations) normality is not such an issue. However, if we solve normality, we avoid a lot of other problems (e.g. heteroscedacity) so that's the main reason why we are doing this analysis.
Homoscedasticity - Homoscedasticity refers to the 'assumption that dependent variable(s) exhibit equal levels of variance across the range of predictor variable(s)' (Hair et al., 2013). Homoscedasticity is desirable because we want the error term to be the same across all values of the independent variables.
Linearity - The most common way to assess linearity is to examine scatter plots and search for linear patterns. If patterns are not linear, it would be worthwhile to explore data transformations. However, we'll not get into this because most of the scatter plots we've seen appear to have linear relationships.
Absence of correlated errors (Multicollinearity) - Correlated errors, like the definition suggests, happen when one error is correlated to another. For instance, if one positive error makes a negative error systematically, it means that there's a relationship between these variables. This occurs often in time series, where some patterns are time related. We'll also not get into this. However, if you detect something, try to add a variable that can explain the effect you're getting. That's the most common solution for correlated errors.
# Variance Inflation Factor per numeric feature to quantify multicollinearity.
# NOTE(review): _get_numeric_data is a private pandas API — select_dtypes would
# be the supported equivalent; confirm before relying on it.
num_data = X._get_numeric_data()
pd.Series([variance_inflation_factor(num_data.values, i)
           for i in range(num_data.shape[1])],
          index=num_data.columns)
# Rebuild X without the identifier/date/duplicated-area columns ('dayhours' was
# listed twice in the original drop set; sets de-duplicate, so once is enough).
X = house_df.drop(['price', 'dayhours', 'cid', 'total_area'], axis=1)
y = house_df[['price']]
# Decorrelate the collinear size/location features with PCA; keep the rest as-is.
colli_cols = ['living_measure', 'living_measure15', 'ceil_measure', 'basement', 'lat', 'long', 'zipcode']
X_colli = X[colli_cols]
X = X.drop(colli_cols, axis=1)
pca = PCA()
X_colli = pca.fit_transform(X_colli)
# NOTE(review): the PCA components are linear mixtures of the originals;
# reusing the original column names here is only for convenience and is
# misleading — the columns no longer carry those meanings.
X_colli = pd.DataFrame(X_colli, columns=colli_cols)
X_con = X.join(X_colli)
X_Pca_train, X_Pca_test, y_Pca_train, y_Pca_test = train_test_split(X_con, y, test_size=0.30 , random_state=1)
LRM = LinearRegression().fit(X_Pca_train, y_Pca_train)
print('Train Score:',LRM.score(X_Pca_train, y_Pca_train))
print('Test Score: ',LRM.score(X_Pca_test, y_Pca_test))
print('Intercept: ',LRM.intercept_)
# Residual plots of price against each candidate feature; a visible pattern
# in the residuals indicates a non-linear relationship.
df_residplot_features = ['living_measure','lot_measure','ceil_measure', 'yr_built', 'lat', 'long', 'living_measure15', 'lot_measure15']
plt.figure(figsize=(20,37))
gs = gridspec.GridSpec(7,3)
for i, cn in enumerate(housing_df[df_residplot_features]):
    ax = plt.subplot(gs[i])
    sns.residplot(housing_df[cn],housing_df['price'],ax=ax)
    ax.set_title(str(cn)[0:])
    ax.set_ylabel(' ')
    ax.set_xlabel(' ')
The point here is to test 'price' in a very lean way. We'll do this by paying attention to:
# plotting histogram (with fitted normal curve) to check the normal distribution for 'price','living_measure','lot_measure','ceil_measure', 'yr_built',
#'zipcode','lat', 'long', 'living_measure15', 'lot_measure15' fields
df_features = ['price','living_measure','lot_measure','ceil_measure', 'yr_built', 'zipcode','lat', 'long', 'living_measure15', 'lot_measure15']
plt.figure(figsize=(20,37))
gs = gridspec.GridSpec(7,3)
for i, cn in enumerate(housing_df[df_features]):
    ax = plt.subplot(gs[i])
    sns.distplot(housing_df[cn], fit=norm)
    ax.set_title(str(cn)[0:])
    ax.set_ylabel(' ')
    ax.set_xlabel(' ')
# using probplot (Q-Q plot against the normal distribution) to find the best fit line for
#'price','living_measure','lot_measure','ceil_measure', 'yr_built', 'zipcode','lat', 'long', 'living_measure15',
# 'lot_measure15' fields
plt.figure(figsize=(20,37))
gs = gridspec.GridSpec(7,3)
for i, cn in enumerate(housing_df[df_features]):
    ax = plt.subplot(gs[i])
    stats.probplot(housing_df[cn], plot=plt)
    ax.set_title(str(cn)[0:])
#Skip the log transformation for lat/long as of now
# We are not normalising lat and long because they are coordinates. We could convert lat & long to x,y,z and then normalise:
#x = cos(lat) * cos(lon)
#y = cos(lat) * sin(lon),
#z = sin(lat)
# Log-transform the skewed price/size columns to bring them closer to normal.
housing_df['price'] = np.log(housing_df['price'])
housing_df['living_measure'] = np.log(housing_df['living_measure'])
housing_df['lot_measure'] = np.log(housing_df['lot_measure'])
housing_df['ceil_measure'] = np.log(housing_df['ceil_measure'])
housing_df['living_measure15'] = np.log(housing_df['living_measure15'])
housing_df['lot_measure15'] = np.log(housing_df['lot_measure15'])
# plotting histogram after log transformation to check the distribution
log_features =['price','living_measure','lot_measure','ceil_measure', 'zipcode','lat', 'living_measure15', 'lot_measure15']
plt.figure(figsize=(20,35))
gs = gridspec.GridSpec(7,3)
for i, cn in enumerate(housing_df[log_features]):
    ax = plt.subplot(gs[i])
    sns.distplot(housing_df[cn], fit= norm)
    ax.set_title(str(cn)[0:])
    ax.set_ylabel(' ')
    ax.set_xlabel(' ')
# Q-Q plots after the log transformation, for the same fields.
plt.figure(figsize=(20,37))
gs = gridspec.GridSpec(7,3)
for i, cn in enumerate(housing_df[log_features]):
    ax = plt.subplot(gs[i])
    stats.probplot(housing_df[cn], plot=plt)
    ax.set_title(str(cn)[0:])
    ax.set_ylabel(' ')
    ax.set_xlabel(' ')
The best approach to test homoscedasticity for two metric variables is graphically. Departures from an equal dispersion are shown by such shapes as cones (small dispersion at one side of the graph, large dispersion at the opposite side) or diamonds (a large number of points at the center of the distribution).
housing_df.columns
# Build the modelling frame: drop ids/raw date/derived duplicates, then
# one-hot encode the categorical columns.
housing_df_model = housing_df.drop({'dayhours','cid','total_area','zipcode'}, axis=1)
##housing_df_model = pd.get_dummies(housing_df_model, columns = ['location','ceil', 'coast', 'sight', 'quality','furnished','condition','room_bath_cat','age_binned','age_renovated_binned'])
print( housing_df_model.columns)
housing_df_model= housing_df_model.drop({'age','age_renovated','lat','long','room_bath','yr_sold'},axis=1)
# Fixed: `.copy` without parentheses stored the bound method instead of a
# snapshot DataFrame; the call makes a real pre-dummies copy.
housing_df_model_later = housing_df_model.copy()
housing_df_model = pd.get_dummies(housing_df_model, columns = ['location','ceil', 'coast', 'sight', 'quality','furnished','condition','room_bath_cat','age_binned','age_renovated_binned'])
print( housing_df_model.columns)
# Train/test split on the dummy-encoded frame and a plain linear regression.
X_reg = housing_df_model.drop({'price'}, axis=1)
y_reg = housing_df_model[['price']]
X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg, test_size=0.30 , random_state=1)
regression_model = LinearRegression()
rm = regression_model.fit(X_train, y_train)
print(X_train.shape)
print(X_test.shape)
print('Train Score')
print(' ')
print(rm.score(X_train, y_train))
print('-----------')
print('Test Score')
print(' ')
print(rm.score(X_test, y_test))
# OLS via statsmodels to get p-values and a full summary.
# NOTE(review): assumes the target 'price' is column 0 of housing_df_model —
# verify the column order before trusting the coefficient mapping below.
X = housing_df_model.iloc[:,1:].values
y = housing_df_model.iloc[:,0].values
X = sm.add_constant(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30 , random_state=1)
model = sm.OLS(endog=y_train,exog=X_train)
results = model.fit()
print(results.summary())
# coefficient of each Independent variable (index 0 is the added constant)
for idx, col_name in enumerate(housing_df_model.columns[0:]):
    if idx==0:
        print("The coefficient for {} is {}".format("Intercept", results.params[idx]))
    else:
        print("The coefficient for {} is {}".format(col_name, results.params[idx]))
def abline(slope, intercept):
    """Draw a dashed reference line with the given slope and intercept across
    the current axes, spanning the present x-limits."""
    axes = plt.gca()
    axes.set_autoscale_on(False)  # keep the existing view; the line must not rescale it
    xs = np.array(axes.get_xlim())
    plt.plot(xs, intercept + slope * xs, '--')
#fit an OLS model to data
model = sm.OLS(y_train,X_train)
results = model.fit()
#predict y values for training data (model.predict(params) evaluates on the training exog)
y_hat = model.predict(results.params)
#plot predicted vs actual; points along y = x indicate a good linear fit
plt.plot(y_hat,y_train,'o')
plt.xlabel("Predicted")#,color='white')
plt.ylabel("Actual")#,color='white')
plt.title('Predicted vs. Actual: Visual Linearity Test')#,color='white')
plt.tick_params(axis='x', colors='white')
plt.tick_params(axis='y', colors='white')
abline(1,0)
plt.show()
# Largest coefficient p-value — candidate features for dropping.
results.pvalues.max()
# configure bootstrap
n_iterations = 1000
regression_model = LinearRegression()
# run bootstrap: refit on 1000 random train/test splits and collect test R^2.
# Renamed the accumulator from `stats` to `scores`: `stats` shadowed the
# `scipy.stats` module imported at the top of the file.
scores = list()
for i in range(n_iterations):
    # prepare train and test sets (a different split each iteration)
    X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg, test_size=0.30 , random_state=i)
    # fit model
    rm = regression_model.fit(X_train, y_train)
    # evaluate model on the held-out split
    score = rm.score(X_test, y_test)
    scores.append(score)
# plot the distribution of scores
plt.hist(scores)
plt.show()
# empirical 95% confidence interval from the score percentiles
alpha = 0.95
p = ((1.0-alpha)/2.0) * 100
lower = max(0.0, np.percentile(scores, p))
p = (alpha+((1.0-alpha)/2.0)) * 100
upper = min(1.0, np.percentile(scores, p))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))
# Marginal-effect demo: predict one example, bump living_measure by one unit,
# and compare the (exp-transformed) predictions. Work on a copy — assigning
# into a .head(1) view would mutate X_test and raise SettingWithCopyWarning.
sample_2 = X_test.head(1).copy()
print('an example where living measure is =' + str(sample_2['living_measure'].values))
print('pedicted house value is ')
print(np.exp(rm.predict(sample_2)))
value1 = np.exp(rm.predict(sample_2))
print('--------------------------------')
sample_2['living_measure'] = sample_2['living_measure'] + 1
print('have changed living measure for the same example and value of living measure is =' + str(sample_2['living_measure'].values))
print('pedicted house value this time is ')
print(np.exp(rm.predict(sample_2)))
value2 = np.exp(rm.predict(sample_2))
print('percent increase in house price when living measure in increased by a unit ' )
print(((value2-value1)/value1)*100)
# Drop the not required variables for the model-comparison stage.
house_df_model=house_df.drop(['cid','dayhours','total_area'],axis=1)
# Let's first normalize the data (z-score every column, price included)
house_df_model_z = house_df_model.apply(zscore)
X = house_df_model_z.drop('price',axis=1)
y = house_df_model_z[['price']]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=101)
Model = []
RMSE = []
R_sq = []
cv = KFold(10, random_state = 1)
def model_score(name, model, x, y):
    """Cross-validate *model* on (x, y) with the shared `cv` folds and append
    its name, RMSE and mean R^2 to the module-level lists Model/RMSE/R_sq."""
    neg_mse = cross_val_score(model, x, y, cv=cv,
                              scoring='neg_mean_squared_error').mean()
    r2 = cross_val_score(model, x, y, cv=cv, scoring='r2').mean()
    Model.append(name)
    RMSE.append(np.sqrt(-neg_mse))
    R_sq.append(r2)
# BUG FIX: SVR is instantiated below but is not imported anywhere in the
# visible file; import defensively (harmless if already in scope).
from sklearn.svm import SVR

# Score several off-the-shelf regressors with 10-fold CV on the train split.
names = ['K Neighbors Regressor', 'Support Vector Regressor(rbf)', 'Support Vector Regressor(linear)',
         'Support Vector Regressor(poly)', 'Decision Tree Regressor', 'Random Forest Regressor']
models = [KNeighborsRegressor(),
          SVR(kernel='rbf'), SVR(kernel='linear'), SVR(kernel='poly'),
          DecisionTreeRegressor(random_state=3), RandomForestRegressor(random_state=3)]
for name, model in zip(names, models):
    model_score(name, model, X_train, y_train)
evaluation = pd.DataFrame({'Model': Model,
                           'RMSE': RMSE,
                           'R Squared': R_sq})
print(evaluation)
# Bar chart of models ranked by R^2, annotating each bar with RMSE and R^2.
evaluation.sort_values(by='R Squared', ascending=False, inplace=True)
f, ax = plt.subplots(figsize=(17, 15))
plt.xlabel('Score')
plt.title('Regressor Score')
sns.set_color_codes("muted")
sns.barplot(x='R Squared', y='Model', data=evaluation, color="g")
for i, v in enumerate(evaluation['RMSE'].round(3)):
    ax.text(0.01, i + 0.05, ("RMSE", v), fontsize=13, color='Black', weight='light')
for i, v in enumerate(evaluation['R Squared'].round(3)):
    ax.text(0.01, i + 0.25, ("R_sq", v), fontsize=13, color='Black', weight='light')
plt.show()
# BUG FIX: `metrics` is first used here but in the original file it is only
# imported much further down; import before use.
from sklearn import metrics

# Accumulators for the ensemble comparison: despite its name, MSE_Score
# stores R^2 (via .score()) and Performance_Test stores RMSE — the table
# built later labels them accordingly.
MSE_Score = []
Performance_Test = []
Alg = []
# Gradient boosting baseline.
gbrg = GradientBoostingRegressor(random_state=3)
gbrg = gbrg.fit(X_train, y_train)
print('Train Score: ', gbrg.score(X_train, y_train))
print('-'*40)
print('Test Score:', gbrg.score(X_test, y_test))
y_pred = gbrg.predict(X_test)
print('-'*40)
Alg.append('Gradient boosting')
MSE_Score.append(gbrg.score(X_test, y_test))
Performance_Test.append(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# BUG FIX: XGBRegressor is used but never imported anywhere in the file;
# `metrics` likewise is only imported later in the original.
from xgboost import XGBRegressor
from sklearn import metrics

# XGBoost with early stopping on the held-out test set.
# NOTE(review): passing eval_metric/early_stopping_rounds to fit() is
# deprecated in recent xgboost (they moved to the constructor) — confirm the
# pinned xgboost version.
modelxgb = XGBRegressor()
modelxgb.fit(X_train, y_train, eval_metric='rmse', early_stopping_rounds=50, eval_set=[(X_test, y_test)], verbose=False)
print('Train Score :', modelxgb.score(X_train, y_train))
print('-'*40)
print('Test Score:', modelxgb.score(X_test, y_test))
y_pred = modelxgb.predict(X_test)
print('-'*40)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
Alg.append('XGBoost')
MSE_Score.append(modelxgb.score(X_test, y_test))
Performance_Test.append(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
from sklearn import metrics  # imported only later in the original file

# AdaBoost on depth-limited decision trees.
# NOTE(review): `base_estimator` was renamed `estimator` in scikit-learn 1.2
# — keep only if the project pins an older sklearn; confirm.
adb_model = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=12), random_state=3)
adb_model.fit(X_train, y_train)
print('Train Score:', adb_model.score(X_train, y_train))
print('-'*40)
print('Test Score: ', adb_model.score(X_test, y_test))
y_pred = adb_model.predict(X_test)
print('-'*40)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
Alg.append('ADABoost')
MSE_Score.append(adb_model.score(X_test, y_test))
Performance_Test.append(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# Summary table and bar chart of the ensemble models, ranked by R^2.
Ensemble_model_df = pd.DataFrame({"Models": Alg, "R Squared": MSE_Score, "RMSE": Performance_Test})
Ensemble_model_df.sort_values(by='R Squared', ascending=False, inplace=True)
f, ax = plt.subplots(figsize=(15, 7))
plt.xlabel('Score')
plt.title('Ensemble Regressor Model Score')
sns.set_color_codes("muted")
# BUG FIX: seaborn >= 0.12 rejects positional x/y; keyword arguments also
# match the earlier barplot call in this file.
g = sns.barplot(x="R Squared", y="Models", data=Ensemble_model_df, color="g")
for i, v in enumerate(Ensemble_model_df['RMSE'].round(3)):
    ax.text(0.01, i + 0.05, ("RMSE", v), fontsize=13, color='Black', weight='light')
for i, v in enumerate(Ensemble_model_df['R Squared'].round(3)):
    ax.text(0.01, i + 0.25, ("R_sq", v), fontsize=13, color='Black', weight='light')
plt.show()
# Continuing after homoscedasticity and label encoding
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
# BUG FIX: the original first assigned the undefined name
# `housing_df_model_later` (a NameError at runtime) and then immediately
# overwrote the result on the next line; the dead assignment is removed.
housing_df_model = housing_df.drop({'dayhours', 'cid', 'total_area', 'zipcode'}, axis=1)
# Encode the binned categorical age columns as integer labels in place.
le.fit(housing_df_model['age_binned'])
housing_df_model['age_binned'] = le.transform(housing_df_model['age_binned'])
le.fit(housing_df_model['age_renovated_binned'])
housing_df_model['age_renovated_binned'] = le.transform(housing_df_model['age_renovated_binned'])
print( housing_df_model.columns)
from sklearn import metrics
from sklearn.metrics import recall_score, confusion_matrix, precision_score, accuracy_score
# Reset the comparison accumulators for the post-encoding feature set.
MSE_Score = []
Performance_Test = []
Alg = []
X = housing_df_model.drop({'price'}, axis=1)
y = housing_df_model[['price']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
# Random forest baseline on the encoded features.
rnd_model = RandomForestRegressor(random_state=3)
rnd_model = rnd_model.fit(X_train, y_train)
print('Train Score: ', rnd_model.score(X_train, y_train))
print('-'*40)
print('Test Score:', rnd_model.score(X_test, y_test))
y_pred = rnd_model.predict(X_test)
print('-'*40)
# BUG FIX: this model is a random forest; the original mislabelled its
# result row as 'Gradient boosting'.
Alg.append('Random Forest')
MSE_Score.append(rnd_model.score(X_test, y_test))
Performance_Test.append(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# AdaBoost on depth-12 trees for the encoded feature set; appends its test
# R^2 and RMSE to the comparison lists.
# NOTE(review): `base_estimator` was renamed `estimator` in scikit-learn 1.2
# — confirm the pinned sklearn version.
adb_model = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=12),random_state=3)
adb_model.fit(X_train, y_train)
print('Train Score:',adb_model.score(X_train, y_train))
print('-'*40)
print('Test Score: ',adb_model.score(X_test, y_test))
y_pred = adb_model.predict(X_test)
print('-'*40)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
Alg.append('ADABoost')
MSE_Score.append(adb_model.score(X_test, y_test))
Performance_Test.append(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# There are several approaches to hyperparameter tuning; grid search and randomised search are tried below.
#Random Forest Regressor
# Sweep n_estimators and plot test-set R^2 to see where performance plateaus.
rf_model = RandomForestRegressor(n_jobs=-1, random_state=3)
# Try different numbers of n_estimators - this will take a minute or so
estimators = np.arange(10, 400, 25)
# set_params() and fit() both return the estimator, so the whole
# configure-fit-score step collapses into one expression per setting.
scores = [
    rf_model.set_params(n_estimators=n).fit(X_train, y_train).score(X_test, y_test)
    for n in estimators
]
plt.title("Effect of n_estimators")
plt.xlabel("n_estimator")
plt.ylabel("score")
plt.plot(estimators, scores)
# Grid search over a small random-forest grid (the second dict varies only
# `bootstrap`, i.e. it tries the defaults with bootstrap=True).
param_grid = [
    {'n_estimators': [135, 185, 210], 'max_features': [15, 10]},
    {'bootstrap': [True]},
]
forest_reg = RandomForestRegressor(random_state=1)
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_train, y_train)
rnd_reg_best = grid_search.best_estimator_
print('Train Score')
print(' ')
print(rnd_reg_best.score(X_train, y_train))
print('Test Score')
print(' ')
print(rnd_reg_best.score(X_test, y_test))
y_pred = rnd_reg_best.predict(X_test)
plt.scatter(y_test['price'], y_pred)
Alg.append('Random Forest')
# BUG FIX: the original appended the raw MSE to MSE_Score and R^2 to
# Performance_Test — the opposite of every other section's convention
# (MSE_Score holds R^2 via .score(), Performance_Test holds RMSE).
MSE_Score.append(rnd_reg_best.score(X_test, y_test))
Performance_Test.append(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# Hand-tuned AdaBoost: many estimators with a small learning rate.
# NOTE(review): `base_estimator` was renamed `estimator` in scikit-learn 1.2
# — confirm the pinned sklearn version.
adb_model = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=12), learning_rate=0.01, loss='linear',
n_estimators=1200, random_state=1920)
adb_model.fit(X_train, y_train)
print('Train Score:',adb_model.score(X_train, y_train))
print('-'*40)
print('Test Score: ',adb_model.score(X_test, y_test))
y_pred = adb_model.predict(X_test)
print('-'*40)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
Alg.append('ADABoost')
MSE_Score.append(adb_model.score(X_test, y_test))
Performance_Test.append(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
def gridsearch_hyperparam_tunning(model_name, reg_model, param_grid):
    """Grid-search `reg_model` over `param_grid` (5-fold CV, neg-MSE scoring)
    on the module-level train split, print hold-out R^2 and RMSE, and return
    the refitted best estimator."""
    # BUG FIX: r2_score / mean_squared_error were never imported in the
    # original file.
    from sklearn.metrics import r2_score, mean_squared_error
    model = reg_model
    ran_model = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1, scoring='neg_mean_squared_error', return_train_score=True)
    # Fit the grid search model
    ran_model.fit(X_train, y_train.values.ravel())
    print(ran_model.best_params_)
    y_hat = ran_model.predict(X_test)
    # BUG FIX: r2_score takes (y_true, y_pred); the original swapped the
    # arguments, which changes the value because R^2 is not symmetric.
    print(model_name + " Prediction R2-score: {}".format(round(r2_score(y_test, y_hat), 4)))
    print(model_name + ' Prediction RMSE:', np.sqrt(mean_squared_error(y_test, y_hat)))
    return ran_model.best_estimator_
def randomised_hyperparam_tunning(model_name, reg_model, param_grid, n_iter_p):
    """Randomised search over `param_grid` (5-fold CV, n_iter_p samples) on
    the module-level train split, print hold-out R^2 and RMSE, and return the
    refitted best estimator."""
    # BUG FIX: r2_score / mean_squared_error were never imported in the
    # original file.
    from sklearn.metrics import r2_score, mean_squared_error
    model = reg_model
    ran_model = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=n_iter_p, cv=5, verbose=2, random_state=42, n_jobs=-1)
    # Fit the random search model
    ran_model.fit(X_train, y_train.values.ravel())
    print(ran_model.best_params_)
    y_hat = ran_model.predict(X_test)
    # BUG FIX: r2_score takes (y_true, y_pred); the original swapped the
    # arguments — R^2 is not symmetric.
    print(model_name + " Prediction R2-score: {}".format(round(r2_score(y_test, y_hat), 4)))
    print(model_name + ' Prediction RMSE:', np.sqrt(mean_squared_error(y_test, y_hat)))
    return ran_model.best_estimator_
# Grid for the grid-search helper above.
# NOTE(review): max_features='auto' was removed for regressors in
# scikit-learn 1.3 — confirm the pinned sklearn version.
rf_param_grid = {'n_estimators': [110,150,180,225,280,350],
'max_features': ['auto', 'sqrt',10,15,20],
'bootstrap': [True],
'max_depth' : [5,10,15],
}
forest_reg = RandomForestRegressor(random_state=3)
rnd_reg_grid_best = gridsearch_hyperparam_tunning("Random Forest Regressor",forest_reg,rf_param_grid)
# Refit on the full train split (GridSearchCV already refits the best
# estimator by default, so this repeats work but is harmless), then report
# hold-out metrics and plot actual vs predicted price.
rnd_reg_grid_best.fit(X_train,y_train)
y_pred = rnd_reg_grid_best.predict(X_test)
print('Train Score: ',rnd_reg_grid_best.score(X_train, y_train))
print('-'*40)
print('Test Score: ',rnd_reg_grid_best.score(X_test, y_test))
print('-'*40)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
plt.scatter(y_test['price'], y_pred)
# BUG FIX: sp_randint was never imported; it conventionally aliases scipy's
# randint distribution.
from scipy.stats import randint as sp_randint

# Distributions for RandomizedSearchCV.
# BUG FIX: the original dict listed "max_features" twice — Python keeps only
# the last value, so sp_randint(1, 15) was silently dead. The value that was
# actually in effect (['auto', 'sqrt']) is kept once.
rf_ran_param_dist = {"n_estimators": sp_randint(50, 400),
                     "max_depth": sp_randint(1, 15),
                     "min_samples_split": sp_randint(2, 11),
                     "min_samples_leaf": sp_randint(1, 11),
                     "bootstrap": [True],
                     "max_features": ['auto', 'sqrt'],
                     }
# Run the randomised search, refit the winner, and report hold-out metrics.
rnd_reg_random_best = randomised_hyperparam_tunning("RandomForest Regressor",RandomForestRegressor(),rf_ran_param_dist,25)
rnd_reg_random_best.fit(X_train,y_train)
y_pred = rnd_reg_random_best.predict(X_test)
print('Train Score: ',rnd_reg_random_best.score(X_train, y_train))
print('-'*40)
print('Test Score: ',rnd_reg_random_best.score(X_test, y_test))
print('-'*40)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# Feature importances sorted descending (scores paired with column names).
# NOTE(review): importances are read from rnd_reg_grid_best (the earlier
# grid-search model), not the randomised-search model fitted just above —
# confirm this is intentional.
x_ax, y_ax = (list(x) for x in zip(*sorted(zip(rnd_reg_grid_best.feature_importances_, X_train.columns),
reverse = True)))
d = {'feature_scores': x_ax, 'feature_names': y_ax}
result_df = pd.DataFrame(data=d)
fig, ax = plt.subplots(figsize=(12,7))
ax = sns.barplot(x='feature_scores', y='feature_names', data=result_df, palette="coolwarm")
plt.title('RandomForestRegressor Feature Importances', fontsize=16)
plt.xlabel('Feature Scores', fontsize=14)
plt.ylabel('Names of the Features', fontsize=14)
# The most important features are latitude, quality, living measure, and furnished.
# Collect every feature whose importance score is below 0.009 (printing each
# dropped score/name pair) and remove them from both splits.
var_drop = []
for score, feature in zip(x_ax, y_ax):
    if score < 0.0090:
        var_drop.append(feature)
        print(score)
        print(feature)
X_train_less = X_train.drop(var_drop, axis=1)
X_test_less = X_test.drop(var_drop, axis=1)
def gridsearch_with_less_param(model_name, reg_model, param_grid):
    """Grid-search `reg_model` on the reduced feature set (X_train_less /
    X_test_less), print hold-out R^2 and RMSE, and return the refitted best
    estimator."""
    # BUG FIX: r2_score / mean_squared_error were never imported in the
    # original file.
    from sklearn.metrics import r2_score, mean_squared_error
    model = reg_model
    ran_model = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1, scoring='neg_mean_squared_error', return_train_score=True)
    # Fit the grid search model
    ran_model.fit(X_train_less, y_train.values.ravel())
    print(ran_model.best_params_)
    y_hat = ran_model.predict(X_test_less)
    # BUG FIX: r2_score takes (y_true, y_pred); the original swapped them.
    print(model_name + " Prediction R2-score: {}".format(round(r2_score(y_test, y_hat), 4)))
    print(model_name + ' Prediction RMSE:', np.sqrt(mean_squared_error(y_test, y_hat)))
    return ran_model.best_estimator_
# Grid for the reduced feature set.
# NOTE(review): max_features='auto' was removed for regressors in
# scikit-learn 1.3 — confirm the pinned sklearn version.
rf_param_grid = {'n_estimators': [180,225,250,280,300,350,400],
'max_features': ['auto', 'sqrt',5,10,14],
'bootstrap': [True],
'max_depth' : [5,10,15],
}
forest_reg = RandomForestRegressor(random_state=1)
rnd_reg_grid_best = gridsearch_with_less_param("Random Forest Regressor",forest_reg,rf_param_grid)
# Refit on the reduced train split (redundant with GridSearchCV's refit but
# harmless) and report hold-out metrics.
rnd_reg_grid_best.fit(X_train_less, y_train)
y_pred = rnd_reg_grid_best.predict(X_test_less)
print('Train Score: ',rnd_reg_grid_best.score(X_train_less, y_train))
print('-'*40)
print('Test Score: ',rnd_reg_grid_best.score(X_test_less, y_test))
print('-'*40)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
def randomised_with_less_param(model_name, reg_model, param_grid, n_iter_p):
    """Randomised search for `reg_model` on the reduced feature set
    (X_train_less / X_test_less), print hold-out R^2 and RMSE, and return the
    refitted best estimator."""
    # BUG FIX: r2_score / mean_squared_error were never imported in the
    # original file.
    from sklearn.metrics import r2_score, mean_squared_error
    model = reg_model
    ran_model = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=n_iter_p, cv=5, verbose=2, random_state=42, n_jobs=-1)
    # Fit the random search model
    ran_model.fit(X_train_less, y_train.values.ravel())
    print(ran_model.best_params_)
    y_hat = ran_model.predict(X_test_less)
    # BUG FIX: r2_score takes (y_true, y_pred); the original swapped them.
    print(model_name + " Prediction R2-score: {}".format(round(r2_score(y_test, y_hat), 4)))
    print(model_name + ' Prediction RMSE:', np.sqrt(mean_squared_error(y_test, y_hat)))
    return ran_model.best_estimator_
# BUG FIX: sp_randint was never imported; it conventionally aliases scipy's
# randint distribution.
from scipy.stats import randint as sp_randint

# Distributions for the reduced-feature randomised search.
# BUG FIX: the original dict repeated the "max_features" key — the first
# value, sp_randint(1, 15), was silently discarded by Python; the value that
# actually took effect (['auto', 'sqrt']) is kept once.
rf_ran_param_dist = {"n_estimators": sp_randint(50, 400),
                     "max_depth": sp_randint(1, 15),
                     "min_samples_split": sp_randint(2, 11),
                     "min_samples_leaf": sp_randint(1, 11),
                     "bootstrap": [True],
                     "max_features": ['auto', 'sqrt'],
                     }
# Run the reduced-feature randomised search, refit the winner, and report
# hold-out metrics.
rnd_reg_random_best = randomised_with_less_param("RandomForest Regressor",RandomForestRegressor(),rf_ran_param_dist,25)
rnd_reg_random_best.fit(X_train_less, y_train)
y_pred = rnd_reg_random_best.predict(X_test_less)
print('Train Score: ',rnd_reg_random_best.score(X_train_less, y_train))
print('-'*40)
print('Test Score: ',rnd_reg_random_best.score(X_test_less, y_test))
print('-'*40)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# What-if check on a single test row: predicted price at quality 7 vs 6.
# BUG FIX: .copy() prevents chained-assignment mutation of X_test_less
# through a view (SettingWithCopyWarning); "pedicted" typos corrected.
sample_1 = X_test_less.tail(1).copy()
sample_1['quality'] = 7
print('an example where quality is =' + str(sample_1['quality'].values))
print('predicted house value is ')
# NOTE(review): np.exp implies a log-transformed target, but the model in
# this section was trained on raw 'price' — confirm the transform.
print(np.exp(rnd_reg_grid_best.predict(sample_1)))
print('--------------------------------')
sample_1['quality'] = 6
print('have changed quality for the same example and value of quality is =' + str(sample_1['quality'].values))
print('predicted house value this time is ')
print(np.exp(rnd_reg_grid_best.predict(sample_1)))
# Bootstrap the tuned random forest over 100 random splits to estimate a 95%
# confidence interval for its test R^2 on the reduced feature set.
# BUG FIX: the score list was named `stats`, shadowing `from scipy import
# stats` at the top of the file; renamed to `boot_scores`. The unused
# n_size / values locals from the original were removed.
X = housing_df_model.drop({'price'}, axis=1)
y = housing_df_model[['price']]
# configure bootstrap
n_iterations = 100
# run bootstrap
boot_scores = []
for i in range(n_iterations):
    # prepare train and test sets, then prune the low-importance features
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=i)
    X_train_less = X_train.drop(var_drop, axis=1)
    X_test_less = X_test.drop(var_drop, axis=1)
    # fit model
    rfb = rnd_reg_grid_best.fit(X_train_less, y_train)
    # evaluate model
    boot_scores.append(rfb.score(X_test_less, y_test))
# plot scores
plt.hist(boot_scores)
plt.show()
# confidence intervals from the empirical percentiles
alpha = 0.95
p = ((1.0 - alpha) / 2.0) * 100
lower = max(0.0, np.percentile(boot_scores, p))
p = (alpha + ((1.0 - alpha) / 2.0)) * 100
upper = min(1.0, np.percentile(boot_scores, p))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))
# Bare expression: displays the tuned estimator's repr in a notebook; it has
# no effect when run as a plain script.
rnd_reg_grid_best
# Final list of features we have to use for Random Forest
print(X_train_less.columns)
# We analysed various parametric and non-parametric algorithms and, considering the points below,
# concluded that we will keep one parametric and one non-parametric model in production.
# The Random Forest model will use 14 parameters, whereas the Linear Regression model will use all 25 parameters.
# We will keep monitoring the deviation (i.e. RMSE) of predictions from actual values, investigate its root causes, and fine-tune the models from time to time for better results.